suppressPackageStartupMessages(library(circlize))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(ggbeeswarm))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(kableExtra))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(grid))
approaches <- c("simple", "hyperG", "propr")

env_vars <- c("depth", "latitude", "longitude", "temperature", "oxygen", "salinity")

id_map = list()

for (env_var in env_vars) {
  id_map[[env_var]] <- paste(tools::toTitleCase(env_var), "(diff.)")
}

id_map[["tip_dist"]] <- "Tip dist."

id_map[["both_gene_count"]] <- "HGT (gene count)"
id_map[["ranger_hgt_tallies"]] <- "HGT (gene count)"

id_map[["cooccur_simple_cooccur"]] <- "Co-occur (Simple)"
id_map[["cooccur_ratio"]] <- "Co-occur (HyperG)"
id_map[["cooccur_asso"]] <- "Co-occur (propr)"

clean_ids <- function(in_vec) { 
  sapply(in_vec, function(x) { id_map[[x]] })
}

comparison_type_cols <- c("#33a02c", "#1f78b4", "#e31a1c")

Initial analyses

summary_stats <- function(x) {
  n <- length(x)
  mean_val <- mean(x, na.rm = TRUE)
  median_val <- median(x, na.rm = TRUE)
  sd_val <- sd(x, na.rm = TRUE)
  se_val <- sd_val / sqrt(n)
  wilcox_out <- wilcox.test(x)
  out <- c(n, mean_val, median_val, sd_val, se_val, wilcox_out$statistic, wilcox_out$p.value)
  names(out) <- c("n", "mean", "median", "sd", "se", "Wilcox_V", "Wilcox_P")
  return(out)
}
read_corr_info <- function(pairwise_file, partial_file) {
  pairwise_in <- read.table(pairwise_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
  pairwise_in$sig <- "Non-sig."
  pairwise_in$sig[which(pairwise_in$p.unc < 0.05)] <- "P < 0.05"

  partial_in <- read.table(partial_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
  
  return(list(pairwise_tab=pairwise_in,
              partial_tab=partial_in))
}

Pairwise spearman

Tara ocean samples

tara_simple_in <- read_corr_info(pairwise_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working/corr_out/metaG_simple.pairwise.tsv",
                                 partial_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working/corr_out/metaG_simple.partial.tsv")

tara_hyperg_in <- read_corr_info(pairwise_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working/corr_out/metaG_hyperG.pairwise.tsv",
                                 partial_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working/corr_out/metaG_hyperG.partial.tsv")

tara_propr_in <- read_corr_info(pairwise_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working/corr_out/metaG_propr.pairwise.tsv",
                                partial_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working/corr_out/metaG_propr.partial.tsv")

tara_simple_in$partial_tab$Comparison <- "Co-occur (Simple)"
tara_hyperg_in$partial_tab$Comparison <- "Co-occur (HyperG)"
tara_propr_in$partial_tab$Comparison <- "Co-occur (propr)"

# Make sure that both_gene_count vs. tip_dist comparisons are identical across all three inputs.
tara_simple_in_hgt_vs_dist <- tara_simple_in$pairwise_tab[which(tara_simple_in$pairwise_tab$X == "both_gene_count" & tara_simple_in$pairwise_tab$Y == "tip_dist"), ]
tara_hyperg_in_hgt_vs_dist <- tara_hyperg_in$pairwise_tab[which(tara_hyperg_in$pairwise_tab$X == "both_gene_count" & tara_hyperg_in$pairwise_tab$Y == "tip_dist"), ]
tara_propr_in_hgt_vs_dist <- tara_propr_in$pairwise_tab[which(tara_propr_in$pairwise_tab$X == "both_gene_count" & tara_propr_in$pairwise_tab$Y == "tip_dist"), ]
# It turns out they are slightly different -- so plot them to confirm the distributions are near-identical, and just keep one for simplicity.

tara_pairwise_combined <- rbind(tara_simple_in$pairwise_tab, tara_hyperg_in$pairwise_tab[which(tara_hyperg_in$pairwise_tab$X != "both_gene_count" | tara_hyperg_in$pairwise_tab$Y != "tip_dist"), ])
tara_pairwise_combined <- rbind(tara_pairwise_combined, tara_propr_in$pairwise_tab[which(tara_propr_in$pairwise_tab$X != "both_gene_count" | tara_propr_in$pairwise_tab$Y != "tip_dist"), ])
tara_pairwise_combined$X <- clean_ids(tara_pairwise_combined$X)
tara_pairwise_combined$Y <- clean_ids(tara_pairwise_combined$Y)
tara_pairwise_combined$Comparison_clean <- paste(tara_pairwise_combined$X, tara_pairwise_combined$Y, sep = "\nvs. ")

tara_partial_combined <- rbind(tara_simple_in$partial_tab, tara_hyperg_in$partial_tab)
tara_partial_combined <- rbind(tara_partial_combined, tara_propr_in$partial_tab)

comparison_levels <- c("HGT (gene count)\nvs. Tip dist.",
                       "Co-occur (Simple)\nvs. Tip dist.",
                       "Co-occur (HyperG)\nvs. Tip dist.",
                       "Co-occur (propr)\nvs. Tip dist.",
                       "Co-occur (Simple)\nvs. HGT (gene count)",
                       "Co-occur (HyperG)\nvs. HGT (gene count)",
                       "Co-occur (propr)\nvs. HGT (gene count)")

tara_pairwise_combined$Comparison_clean <- factor(tara_pairwise_combined$Comparison_clean,
                                                  levels = comparison_levels)

tara_pairwise_combined$Test_type <- "Co-occur vs. HGT"
tara_pairwise_combined[grep("HGT.*Tip dist.", tara_pairwise_combined$Comparison_clean), "Test_type"] <- "HGT vs. Tip dist."
tara_pairwise_combined[grep("Co-occur.*Tip dist.", tara_pairwise_combined$Comparison_clean), "Test_type"] <- "Co-occur vs. Tip dist."
tara_pairwise_combined$Test_type <- factor(tara_pairwise_combined$Test_type, levels = c("HGT vs. Tip dist.", "Co-occur vs. Tip dist.", "Co-occur vs. HGT"))
tara_ranger_simple_in <- read_corr_info(pairwise_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working/ranger_hgt/corr_out/metaG_simple.pairwise.tsv",
                                 partial_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working/ranger_hgt/corr_out/metaG_simple.partial.tsv")

tara_ranger_hyperg_in <- read_corr_info(pairwise_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working/ranger_hgt/corr_out/metaG_hyperG.pairwise.tsv",
                                 partial_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working/ranger_hgt/corr_out/metaG_hyperG.partial.tsv")

tara_ranger_propr_in <- read_corr_info(pairwise_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working/ranger_hgt/corr_out/metaG_propr.pairwise.tsv",
                                partial_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working/ranger_hgt/corr_out/metaG_propr.partial.tsv")

tara_ranger_simple_in$partial_tab$Comparison <- "Co-occur (Simple)"
tara_ranger_hyperg_in$partial_tab$Comparison <- "Co-occur (HyperG)"
tara_ranger_propr_in$partial_tab$Comparison <- "Co-occur (propr)"

# Make sure that both_gene_count vs. tip_dist comparisons are identical across all three inputs.
tara_ranger_simple_in_hgt_vs_dist <- tara_ranger_simple_in$pairwise_tab[which(tara_ranger_simple_in$pairwise_tab$X == "ranger_hgt_tallies" & tara_ranger_simple_in$pairwise_tab$Y == "tip_dist"), ]
tara_ranger_hyperg_in_hgt_vs_dist <- tara_ranger_hyperg_in$pairwise_tab[which(tara_ranger_hyperg_in$pairwise_tab$X == "ranger_hgt_tallies" & tara_ranger_hyperg_in$pairwise_tab$Y == "tip_dist"), ]
tara_ranger_propr_in_hgt_vs_dist <- tara_ranger_propr_in$pairwise_tab[which(tara_ranger_propr_in$pairwise_tab$X == "ranger_hgt_tallies" & tara_ranger_propr_in$pairwise_tab$Y == "tip_dist"), ]
# It turns out they are slightly different -- so plot them to confirm the distributions are near-identical, and just keep one for simplicity.

tara_ranger_pairwise_combined <- rbind(tara_ranger_simple_in$pairwise_tab, tara_ranger_hyperg_in$pairwise_tab[which(tara_ranger_hyperg_in$pairwise_tab$X != "ranger_hgt_tallies" | tara_ranger_hyperg_in$pairwise_tab$Y != "tip_dist"), ])
tara_ranger_pairwise_combined <- rbind(tara_ranger_pairwise_combined, tara_ranger_propr_in$pairwise_tab[which(tara_ranger_propr_in$pairwise_tab$X != "ranger_hgt_tallies" | tara_ranger_propr_in$pairwise_tab$Y != "tip_dist"), ])
tara_ranger_pairwise_combined$X <- clean_ids(tara_ranger_pairwise_combined$X)
tara_ranger_pairwise_combined$Y <- clean_ids(tara_ranger_pairwise_combined$Y)
tara_ranger_pairwise_combined$Comparison_clean <- paste(tara_ranger_pairwise_combined$X, tara_ranger_pairwise_combined$Y, sep = "\nvs. ")

tara_ranger_partial_combined <- rbind(tara_ranger_simple_in$partial_tab, tara_ranger_hyperg_in$partial_tab)
tara_ranger_partial_combined <- rbind(tara_ranger_partial_combined, tara_ranger_propr_in$partial_tab)

comparison_levels <- c("HGT (gene count)\nvs. Tip dist.",
                       "Co-occur (Simple)\nvs. Tip dist.",
                       "Co-occur (HyperG)\nvs. Tip dist.",
                       "Co-occur (propr)\nvs. Tip dist.",
                       "Co-occur (Simple)\nvs. HGT (gene count)",
                       "Co-occur (HyperG)\nvs. HGT (gene count)",
                       "Co-occur (propr)\nvs. HGT (gene count)")

tara_ranger_pairwise_combined$Comparison_clean <- factor(tara_ranger_pairwise_combined$Comparison_clean,
                                                  levels = comparison_levels)

tara_ranger_pairwise_combined$Test_type <- "Co-occur vs. HGT"
tara_ranger_pairwise_combined[grep("HGT.*Tip dist.", tara_ranger_pairwise_combined$Comparison_clean), "Test_type"] <- "HGT vs. Tip dist."
tara_ranger_pairwise_combined[grep("Co-occur.*Tip dist.", tara_ranger_pairwise_combined$Comparison_clean), "Test_type"] <- "Co-occur vs. Tip dist."
tara_ranger_pairwise_combined$Test_type <- factor(tara_ranger_pairwise_combined$Test_type, levels = c("HGT vs. Tip dist.", "Co-occur vs. Tip dist.", "Co-occur vs. HGT"))

BLAST (genus and above)

print(ggplot(data = tara_pairwise_combined, aes(x = Comparison_clean, y = r, fill = Test_type)) +
                              geom_quasirandom(colour = "grey80", alpha = 0.5) +
                              geom_boxplot(outlier.shape = NA, alpha=0.7) +
                              theme_bw() +
                              theme(panel.grid.major.x = element_blank(),
                                    panel.grid.minor.x = element_blank(),
                                    plot.title = element_text(hjust = 0.5)) +
                              ylab("Spearman's correlation coefficient") +
                              xlab("Pairwise comparison") +
                              labs(fill="Comparison type") +
                              ggtitle("Tara ocean samples (BLAST - genus and above)") +
                              scale_fill_manual(values=comparison_type_cols)
)

RANGER (between strains)

print(ggplot(data = tara_ranger_pairwise_combined, aes(x = Comparison_clean, y = r, fill = Test_type)) +
                              geom_quasirandom(colour = "grey80", alpha = 0.5) +
                              geom_boxplot(outlier.shape = NA, alpha=0.7) +
                              theme_bw() +
                              theme(panel.grid.major.x = element_blank(),
                                    panel.grid.minor.x = element_blank(),
                                    plot.title = element_text(hjust = 0.5)) +
                              ylab("Spearman's correlation coefficient") +
                              xlab("Pairwise comparison") +
                              labs(fill="Comparison type") +
                              ggtitle("Tara ocean samples (RANGER - between strains)") +
                              scale_fill_manual(values=comparison_type_cols)
)

HGT vs. Tip dist. (all subsets)

Note that the correlations for tip dist. vs HGT are slightly different depending on the taxon subset (which depends on the co-occurrence approach). Fortunately the distributions are very similar, so this is not something to worry about:

boxplot(tara_simple_in_hgt_vs_dist$r, tara_hyperg_in_hgt_vs_dist$r, tara_propr_in_hgt_vs_dist$r,
        names=c("simple", "hyperG", "propr"),
       main= "Spearman - tip dist. vs HGT (both gene count)")

Additional samples

extraonly_simple_in <- read_corr_info(pairwise_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_extraonly/corr_out/metaG_simple.pairwise.tsv",
                                 partial_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_extraonly/corr_out/metaG_simple.partial.tsv")

extraonly_hyperg_in <- read_corr_info(pairwise_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_extraonly/corr_out/metaG_hyperG.pairwise.tsv",
                                 partial_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_extraonly/corr_out/metaG_hyperG.partial.tsv")

extraonly_propr_in <- read_corr_info(pairwise_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_extraonly/corr_out/metaG_propr.pairwise.tsv",
                                partial_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_extraonly/corr_out/metaG_propr.partial.tsv")

extraonly_simple_in$partial_tab$Comparison <- "Co-occur (Simple)"
extraonly_hyperg_in$partial_tab$Comparison <- "Co-occur (HyperG)"
extraonly_propr_in$partial_tab$Comparison <- "Co-occur (propr)"

# Make sure that both_gene_count vs. tip_dist comparisons are identical across all three inputs.
extraonly_simple_in_hgt_vs_dist <- extraonly_simple_in$pairwise_tab[which(extraonly_simple_in$pairwise_tab$X == "both_gene_count" & extraonly_simple_in$pairwise_tab$Y == "tip_dist"), ]
extraonly_hyperg_in_hgt_vs_dist <- extraonly_hyperg_in$pairwise_tab[which(extraonly_hyperg_in$pairwise_tab$X == "both_gene_count" & extraonly_hyperg_in$pairwise_tab$Y == "tip_dist"), ]
extraonly_propr_in_hgt_vs_dist <- extraonly_propr_in$pairwise_tab[which(extraonly_propr_in$pairwise_tab$X == "both_gene_count" & extraonly_propr_in$pairwise_tab$Y == "tip_dist"), ]
# It turns out they are slightly different -- so plot them to confirm the distributions are near-identical, and just keep one for simplicity.

extraonly_pairwise_combined <- rbind(extraonly_simple_in$pairwise_tab, extraonly_hyperg_in$pairwise_tab[which(extraonly_hyperg_in$pairwise_tab$X != "both_gene_count" | extraonly_hyperg_in$pairwise_tab$Y != "tip_dist"), ])
extraonly_pairwise_combined <- rbind(extraonly_pairwise_combined, extraonly_propr_in$pairwise_tab[which(extraonly_propr_in$pairwise_tab$X != "both_gene_count" | extraonly_propr_in$pairwise_tab$Y != "tip_dist"), ])
extraonly_pairwise_combined$X <- clean_ids(extraonly_pairwise_combined$X)
extraonly_pairwise_combined$Y <- clean_ids(extraonly_pairwise_combined$Y)
extraonly_pairwise_combined$Comparison_clean <- paste(extraonly_pairwise_combined$X, extraonly_pairwise_combined$Y, sep = "\nvs. ")

extraonly_partial_combined <- rbind(extraonly_simple_in$partial_tab, extraonly_hyperg_in$partial_tab)
extraonly_partial_combined <- rbind(extraonly_partial_combined, extraonly_propr_in$partial_tab)

comparison_levels <- c("HGT (gene count)\nvs. Tip dist.",
                       "Co-occur (Simple)\nvs. Tip dist.",
                       "Co-occur (HyperG)\nvs. Tip dist.",
                       "Co-occur (propr)\nvs. Tip dist.",
                       "Co-occur (Simple)\nvs. HGT (gene count)",
                       "Co-occur (HyperG)\nvs. HGT (gene count)",
                       "Co-occur (propr)\nvs. HGT (gene count)")

extraonly_pairwise_combined$Comparison_clean <- factor(extraonly_pairwise_combined$Comparison_clean,
                                                  levels = comparison_levels)

extraonly_pairwise_combined$Test_type <- "Co-occur vs. HGT"
extraonly_pairwise_combined[grep("HGT.*Tip dist.", extraonly_pairwise_combined$Comparison_clean), "Test_type"] <- "HGT vs. Tip dist."
extraonly_pairwise_combined[grep("Co-occur.*Tip dist.", extraonly_pairwise_combined$Comparison_clean), "Test_type"] <- "Co-occur vs. Tip dist."
extraonly_pairwise_combined$Test_type <- factor(extraonly_pairwise_combined$Test_type, levels = c("HGT vs. Tip dist.", "Co-occur vs. Tip dist.", "Co-occur vs. HGT"))
extraonly_ranger_simple_in <- read_corr_info(pairwise_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_extraonly/ranger_hgt/corr_out/metaG_simple.pairwise.tsv",
                                 partial_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_extraonly/ranger_hgt/corr_out/metaG_simple.partial.tsv")

extraonly_ranger_hyperg_in <- read_corr_info(pairwise_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_extraonly/ranger_hgt/corr_out/metaG_hyperG.pairwise.tsv",
                                 partial_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_extraonly/ranger_hgt/corr_out/metaG_hyperG.partial.tsv")

extraonly_ranger_propr_in <- read_corr_info(pairwise_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_extraonly/ranger_hgt/corr_out/metaG_propr.pairwise.tsv",
                                partial_file = "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_extraonly/ranger_hgt/corr_out/metaG_propr.partial.tsv")

extraonly_ranger_simple_in$partial_tab$Comparison <- "Co-occur (Simple)"
extraonly_ranger_hyperg_in$partial_tab$Comparison <- "Co-occur (HyperG)"
extraonly_ranger_propr_in$partial_tab$Comparison <- "Co-occur (propr)"

# Make sure that both_gene_count vs. tip_dist comparisons are identical across all three inputs.
extraonly_ranger_simple_in_hgt_vs_dist <- extraonly_ranger_simple_in$pairwise_tab[which(extraonly_ranger_simple_in$pairwise_tab$X == "ranger_hgt_tallies" & extraonly_ranger_simple_in$pairwise_tab$Y == "tip_dist"), ]
extraonly_ranger_hyperg_in_hgt_vs_dist <- extraonly_ranger_hyperg_in$pairwise_tab[which(extraonly_ranger_hyperg_in$pairwise_tab$X == "ranger_hgt_tallies" & extraonly_ranger_hyperg_in$pairwise_tab$Y == "tip_dist"), ]
extraonly_ranger_propr_in_hgt_vs_dist <- extraonly_ranger_propr_in$pairwise_tab[which(extraonly_ranger_propr_in$pairwise_tab$X == "ranger_hgt_tallies" & extraonly_ranger_propr_in$pairwise_tab$Y == "tip_dist"), ]
# It turns out they are slightly different -- so plot them to confirm the distributions are near-identical, and just keep one for simplicity.

extraonly_ranger_pairwise_combined <- rbind(extraonly_ranger_simple_in$pairwise_tab, extraonly_ranger_hyperg_in$pairwise_tab[which(extraonly_ranger_hyperg_in$pairwise_tab$X != "ranger_hgt_tallies" | extraonly_ranger_hyperg_in$pairwise_tab$Y != "tip_dist"), ])
extraonly_ranger_pairwise_combined <- rbind(extraonly_ranger_pairwise_combined, extraonly_ranger_propr_in$pairwise_tab[which(extraonly_ranger_propr_in$pairwise_tab$X != "ranger_hgt_tallies" | extraonly_ranger_propr_in$pairwise_tab$Y != "tip_dist"), ])
extraonly_ranger_pairwise_combined$X <- clean_ids(extraonly_ranger_pairwise_combined$X)
extraonly_ranger_pairwise_combined$Y <- clean_ids(extraonly_ranger_pairwise_combined$Y)
extraonly_ranger_pairwise_combined$Comparison_clean <- paste(extraonly_ranger_pairwise_combined$X, extraonly_ranger_pairwise_combined$Y, sep = "\nvs. ")

extraonly_ranger_partial_combined <- rbind(extraonly_ranger_simple_in$partial_tab, extraonly_ranger_hyperg_in$partial_tab)
extraonly_ranger_partial_combined <- rbind(extraonly_ranger_partial_combined, extraonly_ranger_propr_in$partial_tab)

comparison_levels <- c("HGT (gene count)\nvs. Tip dist.",
                       "Co-occur (Simple)\nvs. Tip dist.",
                       "Co-occur (HyperG)\nvs. Tip dist.",
                       "Co-occur (propr)\nvs. Tip dist.",
                       "Co-occur (Simple)\nvs. HGT (gene count)",
                       "Co-occur (HyperG)\nvs. HGT (gene count)",
                       "Co-occur (propr)\nvs. HGT (gene count)")

extraonly_ranger_pairwise_combined$Comparison_clean <- factor(extraonly_ranger_pairwise_combined$Comparison_clean,
                                                  levels = comparison_levels)

extraonly_ranger_pairwise_combined$Test_type <- "Co-occur vs. HGT"
extraonly_ranger_pairwise_combined[grep("HGT.*Tip dist.", extraonly_ranger_pairwise_combined$Comparison_clean), "Test_type"] <- "HGT vs. Tip dist."
extraonly_ranger_pairwise_combined[grep("Co-occur.*Tip dist.", extraonly_ranger_pairwise_combined$Comparison_clean), "Test_type"] <- "Co-occur vs. Tip dist."
extraonly_ranger_pairwise_combined$Test_type <- factor(extraonly_ranger_pairwise_combined$Test_type, levels = c("HGT vs. Tip dist.", "Co-occur vs. Tip dist.", "Co-occur vs. HGT"))

BLAST (genus and above)

print(ggplot(data = extraonly_pairwise_combined, aes(x = Comparison_clean, y = r, fill = Test_type)) +
                              geom_quasirandom(colour = "grey80", alpha = 0.5) +
                              geom_boxplot(outlier.shape = NA, alpha=0.7) +
                              theme_bw() +
                              theme(panel.grid.major.x = element_blank(),
                                    panel.grid.minor.x = element_blank(),
                                    plot.title = element_text(hjust = 0.5)) +
                              ylab("Spearman's correlation coefficient") +
                              xlab("Pairwise comparison") +
                              labs(fill="Comparison type") +
                              ggtitle("Additional ocean samples (BLAST - genus and above)") +
                              scale_fill_manual(values=comparison_type_cols)
)

RANGER (strains)

print(ggplot(data = extraonly_ranger_pairwise_combined, aes(x = Comparison_clean, y = r, fill = Test_type)) +
                              geom_quasirandom(colour = "grey80", alpha = 0.5) +
                              geom_boxplot(outlier.shape = NA, alpha=0.7) +
                              theme_bw() +
                              theme(panel.grid.major.x = element_blank(),
                                    panel.grid.minor.x = element_blank(),
                                    plot.title = element_text(hjust = 0.5)) +
                              ylab("Spearman's correlation coefficient") +
                              xlab("Pairwise comparison") +
                              labs(fill="Comparison type") +
                              ggtitle("Additional ocean samples (RANGER - between strains)") +
                              scale_fill_manual(values=comparison_type_cols)
)

HGT vs. Tip dist. (all subsets)

Note that the correlations for tip dist. vs HGT are slightly different depending on the taxon subset (which depends on the co-occurrence approach). Fortunately the distributions are very similar, so this is not something to worry about:

boxplot(extraonly_simple_in_hgt_vs_dist$r, extraonly_hyperg_in_hgt_vs_dist$r, extraonly_propr_in_hgt_vs_dist$r,
        names=c("simple", "hyperG", "propr"),
        main="Spearman - tip dist. vs HGT (both gene count)")

Pairwise partial spearman

Tara ocean samples

BLAST (genus and above)

tara_partial_combined$Comparison[which(tara_partial_combined$Comparison == "Co-occur (Simple)")] <- "Simple"
tara_partial_combined$Comparison[which(tara_partial_combined$Comparison == "Co-occur (HyperG)")] <- "HyperG"
tara_partial_combined$Comparison[which(tara_partial_combined$Comparison == "Co-occur (propr)")] <- "propr"
tara_partial_combined$Comparison <- factor(tara_partial_combined$Comparison, levels = c("Simple", "HyperG", "propr"))

print(ggplot(data = tara_partial_combined, aes(x = Comparison, y = r,)) +
                              geom_quasirandom(colour = "grey80", alpha = 0.5) +
                              geom_boxplot(outlier.shape = NA, alpha=0.7) +
                              theme_bw() +
                              theme(panel.grid.major.x = element_blank(),
                                    panel.grid.minor.x = element_blank(),
                                    plot.title = element_text(hjust = 0.5)) +
                              ylab("Partial Spearman's correlation coefficient") +
                              xlab("Co-occurrence approach") +
                              ggtitle("Tara ocean samples (BLAST - genus and above)\nSpearman vs. HGT (gene count), controlling for tip dist.")
)

RANGER (strains)

tara_ranger_partial_combined$Comparison[which(tara_ranger_partial_combined$Comparison == "Co-occur (Simple)")] <- "Simple"
tara_ranger_partial_combined$Comparison[which(tara_ranger_partial_combined$Comparison == "Co-occur (HyperG)")] <- "HyperG"
tara_ranger_partial_combined$Comparison[which(tara_ranger_partial_combined$Comparison == "Co-occur (propr)")] <- "propr"
tara_ranger_partial_combined$Comparison <- factor(tara_ranger_partial_combined$Comparison, levels = c("Simple", "HyperG", "propr"))

print(ggplot(data = tara_ranger_partial_combined, aes(x = Comparison, y = r,)) +
                              geom_quasirandom(colour = "grey80", alpha = 0.5) +
                              geom_boxplot(outlier.shape = NA, alpha=0.7) +
                              theme_bw() +
                              theme(panel.grid.major.x = element_blank(),
                                    panel.grid.minor.x = element_blank(),
                                    plot.title = element_text(hjust = 0.5)) +
                              ylab("Partial Spearman's correlation coefficient") +
                              xlab("Co-occurrence approach") +
                              ggtitle("Tara ocean samples (RANGER - between strains)\nSpearman vs. HGT (gene count), controlling for tip dist.")
)

Additional ocean samples

BLAST (genus and above)

extraonly_partial_combined$Comparison[which(extraonly_partial_combined$Comparison == "Co-occur (Simple)")] <- "Simple"
extraonly_partial_combined$Comparison[which(extraonly_partial_combined$Comparison == "Co-occur (HyperG)")] <- "HyperG"
extraonly_partial_combined$Comparison[which(extraonly_partial_combined$Comparison == "Co-occur (propr)")] <- "propr"
extraonly_partial_combined$Comparison <- factor(extraonly_partial_combined$Comparison, levels = c("Simple", "HyperG", "propr"))

print(ggplot(data = extraonly_partial_combined, aes(x = Comparison, y = r,)) +
                              geom_quasirandom(colour = "grey80", alpha = 0.5) +
                              geom_boxplot(outlier.shape = NA, alpha=0.7) +
                              theme_bw() +
                              theme(panel.grid.major.x = element_blank(),
                                    panel.grid.minor.x = element_blank(),
                                    plot.title = element_text(hjust = 0.5)) +
                              ylab("Partial Spearman's correlation coefficient") +
                              xlab("Co-occurrence approach") +
                              ggtitle("Additional ocean samples (BLAST - genus and above)\nSpearman vs. HGT (gene count), controlling for tip dist.")
)
## Warning: Removed 25 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 25 rows containing missing values or values outside the scale range
## (`position_quasirandom()`).

RANGER (strains)

extraonly_ranger_partial_combined$Comparison[which(extraonly_ranger_partial_combined$Comparison == "Co-occur (Simple)")] <- "Simple"
extraonly_ranger_partial_combined$Comparison[which(extraonly_ranger_partial_combined$Comparison == "Co-occur (HyperG)")] <- "HyperG"
extraonly_ranger_partial_combined$Comparison[which(extraonly_ranger_partial_combined$Comparison == "Co-occur (propr)")] <- "propr"
extraonly_ranger_partial_combined$Comparison <- factor(extraonly_ranger_partial_combined$Comparison, levels = c("Simple", "HyperG", "propr"))

print(ggplot(data = extraonly_ranger_partial_combined, aes(x = Comparison, y = r,)) +
                              geom_quasirandom(colour = "grey80", alpha = 0.5) +
                              geom_boxplot(outlier.shape = NA, alpha=0.7) +
                              theme_bw() +
                              theme(panel.grid.major.x = element_blank(),
                                    panel.grid.minor.x = element_blank(),
                                    plot.title = element_text(hjust = 0.5)) +
                              ylab("Partial Spearman's correlation coefficient") +
                              xlab("Co-occurrence approach") +
                              ggtitle("Additional ocean samples (RANGER - between strains)\nSpearman vs. HGT (gene count), controlling for tip dist.")
)

All samples and variables

Pairwise Spearman correlations

mean_pairwise = list()
mean_pairwise[["BLAST"]] <- list()
mean_pairwise[["RANGER"]] <- list()

prep_pairwise_tab <- function(infile) {
  in_pairwise <- read.table(infile, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
  X_tallies <- table(in_pairwise$X)
  Y_tallies <- table(in_pairwise$Y)
  in_pairwise$X <- factor(in_pairwise$X, levels = names(X_tallies)[order(X_tallies, decreasing = FALSE)])
  in_pairwise$Y <- factor(in_pairwise$Y, levels = names(Y_tallies)[order(Y_tallies, decreasing = FALSE)])
  in_pairwise_mean <- aggregate(x = r ~ X + Y, FUN = mean, data = in_pairwise)
  in_pairwise_mean_wide <- reshape2::dcast(data = in_pairwise_mean, formula = X ~ Y, value.var = 'r')
  rownames(in_pairwise_mean_wide) <- in_pairwise_mean_wide[, 1]
  
  # Also get comparisons not significantly different from 0, so that these can be coloured as grey.
  unique_combos <- in_pairwise[, c("X", "Y")]
  unique_combos <- unique_combos[-which(duplicated(unique_combos)), ]
  nonsig_comparisons <- list()
  nonsig_count <- 1
  for (i in 1:nrow(unique_combos)) {
    x_subset <- unique_combos[i, "X"]
    y_subset <- unique_combos[i, "Y"]
    combo_subset <- in_pairwise[which(in_pairwise$X == x_subset & in_pairwise$Y == y_subset), ]
    wilox_combo_out <- wilcox.test(combo_subset$r)
    if (wilox_combo_out$p.value >= 0.05) {
      print("nonsig")
      nonsig_comparisons[[nonsig_count]] <- c(y_subset, x_subset)
      nonsig_count <- nonsig_count + 1
    }
  }
  
  return(list(tab=in_pairwise_mean_wide[, -1],
              nonsig_comparisons=nonsig_comparisons))
}

for (approach in approaches) {
  infile_blast_pairwise <- paste("/mfs/gdouglas/projects/ocean_mags/coverm/network_working_allsamples/partial_corr_out/", approach, ".pairwise.tsv", sep = "")
  mean_pairwise[["BLAST"]][[approach]] <- prep_pairwise_tab(infile_blast_pairwise)
  
  infile_ranger_pairwise <- paste("/mfs/gdouglas/projects/ocean_mags/coverm/network_working_allsamples/ranger_hgt/partial_corr_out/", approach, ".pairwise.tsv", sep = "")
  mean_pairwise[["RANGER"]][[approach]] <- prep_pairwise_tab(infile_ranger_pairwise)
}

identity_sanity_checks <- c(identical(mean_pairwise[["BLAST"]]$hyperG$tab[-8, ],  mean_pairwise[["BLAST"]]$propr$tab[-8, ]),
                            identical(mean_pairwise[["BLAST"]]$simple$tab[-8, ],  mean_pairwise[["BLAST"]]$propr$tab[-8, ]),
                            identical(mean_pairwise[["RANGER"]]$hyperG$tab[-8, ],  mean_pairwise[["RANGER"]]$propr$tab[-8, ]),
                            identical(mean_pairwise[["RANGER"]]$simple$tab[-8, ],  mean_pairwise[["RANGER"]]$propr$tab[-8, ]))

if (length(which(! identity_sanity_checks)) > 0) { stop("Non-co-occurrence mismatches" )}

# Then realized afterwards that I should just combine the co-occurrence types into a single heatmap.
mean_pairwise_combined <- list()

BLAST_hyperG_row <- mean_pairwise[["BLAST"]]$hyperG$tab["cooccur_ratio", , drop = FALSE]
BLAST_propr_row <- mean_pairwise[["BLAST"]]$propr$tab["cooccur_asso", , drop = FALSE]

RANGER_hyperG_row <- mean_pairwise[["RANGER"]]$hyperG$tab["cooccur_ratio", , drop = FALSE]
RANGER_propr_row <- mean_pairwise[["RANGER"]]$propr$tab["cooccur_asso", , drop = FALSE]

mean_pairwise_combined[["BLAST"]] <- rbind(mean_pairwise[["BLAST"]]$simple$tab, BLAST_hyperG_row)
mean_pairwise_combined[["BLAST"]] <- rbind(mean_pairwise_combined[["BLAST"]], BLAST_propr_row)

mean_pairwise_combined[["RANGER"]] <- rbind(mean_pairwise[["RANGER"]]$simple$tab, RANGER_hyperG_row)
mean_pairwise_combined[["RANGER"]] <- rbind(mean_pairwise_combined[["RANGER"]], RANGER_propr_row)

BLAST results

RANGER results

Partial Spearman correlations

allsamples_partial_folder <- "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_allsamples/partial_corr_out"

allsamples_simple_partial_in <- read.table(paste(allsamples_partial_folder, "simple.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_simple_partial_tiponly_in <- read.table(paste(allsamples_partial_folder, "simple.tiponly.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_simple_partial_permuted_in <- read.table(paste(allsamples_partial_folder, "simple.permuted.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)

allsamples_simple_spearman_in <- read.table(paste(allsamples_partial_folder, "simple.pairwise.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_simple_spearman_in <- allsamples_simple_spearman_in[grep("cooccur", allsamples_simple_spearman_in$X), ]
allsamples_simple_spearman_in <- allsamples_simple_spearman_in[which(allsamples_simple_spearman_in$Y == "both_gene_count"), ]
allsamples_simple_spearman_in$p.val <- allsamples_simple_spearman_in$p.unc
allsamples_simple_spearman_in <- allsamples_simple_spearman_in[, colnames(allsamples_simple_partial_tiponly_in)]

allsamples_simple_partial_in$type <- "All var. controlled"
allsamples_simple_partial_tiponly_in$type <- "Tip dist. controlled"
allsamples_simple_partial_permuted_in$type <- "All var. controlled (permuted)"
allsamples_simple_spearman_in$type <- "No control"

allsamples_simple_combined <- do.call(rbind, list(allsamples_simple_partial_in, allsamples_simple_partial_tiponly_in, 
                                                  allsamples_simple_partial_permuted_in, allsamples_simple_spearman_in))
allsamples_simple_combined$Approach <- "Simple"

allsamples_hyperG_partial_in <- read.table(paste(allsamples_partial_folder, "hyperG.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_hyperG_partial_tiponly_in <- read.table(paste(allsamples_partial_folder, "hyperG.tiponly.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_hyperG_partial_permuted_in <- read.table(paste(allsamples_partial_folder, "hyperG.permuted.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)

allsamples_hyperG_spearman_in <- read.table(paste(allsamples_partial_folder, "hyperG.pairwise.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_hyperG_spearman_in <- allsamples_hyperG_spearman_in[grep("cooccur", allsamples_hyperG_spearman_in$X), ]
allsamples_hyperG_spearman_in <- allsamples_hyperG_spearman_in[which(allsamples_hyperG_spearman_in$Y == "both_gene_count"), ]
allsamples_hyperG_spearman_in$p.val <- allsamples_hyperG_spearman_in$p.unc
allsamples_hyperG_spearman_in <- allsamples_hyperG_spearman_in[, colnames(allsamples_hyperG_partial_tiponly_in)]

allsamples_hyperG_partial_in$type <- "All var. controlled"
allsamples_hyperG_partial_tiponly_in$type <- "Tip dist. controlled"
allsamples_hyperG_partial_permuted_in$type <- "All var. controlled (permuted)"
allsamples_hyperG_spearman_in$type <- "No control"

allsamples_hyperG_combined <- do.call(rbind, list(allsamples_hyperG_partial_in, allsamples_hyperG_partial_tiponly_in, 
                                                  allsamples_hyperG_partial_permuted_in, allsamples_hyperG_spearman_in))
allsamples_hyperG_combined$Approach <- "HyperG"

allsamples_propr_partial_in <- read.table(paste(allsamples_partial_folder, "propr.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_propr_partial_tiponly_in <- read.table(paste(allsamples_partial_folder, "propr.tiponly.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_propr_partial_permuted_in <- read.table(paste(allsamples_partial_folder, "propr.permuted.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)

allsamples_propr_spearman_in <- read.table(paste(allsamples_partial_folder, "propr.pairwise.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_propr_spearman_in <- allsamples_propr_spearman_in[grep("cooccur", allsamples_propr_spearman_in$X), ]
allsamples_propr_spearman_in <- allsamples_propr_spearman_in[which(allsamples_propr_spearman_in$Y == "both_gene_count"), ]
allsamples_propr_spearman_in$p.val <- allsamples_propr_spearman_in$p.unc
allsamples_propr_spearman_in <- allsamples_propr_spearman_in[, colnames(allsamples_propr_partial_tiponly_in)]

allsamples_propr_partial_in$type <- "All var. controlled"
allsamples_propr_partial_tiponly_in$type <- "Tip dist. controlled"
allsamples_propr_partial_permuted_in$type <- "All var. controlled (permuted)"
allsamples_propr_spearman_in$type <- "No control"

allsamples_propr_combined <- do.call(rbind, list(allsamples_propr_partial_in, allsamples_propr_partial_tiponly_in, 
                                                  allsamples_propr_partial_permuted_in, allsamples_propr_spearman_in))
allsamples_propr_combined$Approach <- "propr"

allsamples_partial_combined <- do.call(rbind, list(allsamples_simple_combined, allsamples_hyperG_combined, allsamples_propr_combined))
allsamples_partial_combined$Approach <- factor(allsamples_partial_combined$Approach, levels = c("Simple", "HyperG", "propr"))
allsamples_partial_combined$type <- factor(allsamples_partial_combined$type, levels = c("No control", "Tip dist. controlled", "All var. controlled", "All var. controlled (permuted)"))
allsamples_ranger_partial_folder <- "/mfs/gdouglas/projects/ocean_mags/coverm/network_working_allsamples/ranger_hgt/partial_corr_out"

allsamples_ranger_simple_partial_in <- read.table(paste(allsamples_ranger_partial_folder, "simple.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_ranger_simple_partial_tiponly_in <- read.table(paste(allsamples_ranger_partial_folder, "simple.tiponly.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_ranger_simple_partial_permuted_in <- read.table(paste(allsamples_ranger_partial_folder, "simple.permuted.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)

allsamples_ranger_simple_spearman_in <- read.table(paste(allsamples_ranger_partial_folder, "simple.pairwise.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_ranger_simple_spearman_in <- allsamples_ranger_simple_spearman_in[grep("cooccur", allsamples_ranger_simple_spearman_in$X), ]
allsamples_ranger_simple_spearman_in <- allsamples_ranger_simple_spearman_in[which(allsamples_ranger_simple_spearman_in$Y == "ranger_hgt_tallies"), ]
allsamples_ranger_simple_spearman_in$p.val <- allsamples_ranger_simple_spearman_in$p.unc
allsamples_ranger_simple_spearman_in <- allsamples_ranger_simple_spearman_in[, colnames(allsamples_ranger_simple_partial_tiponly_in)]

allsamples_ranger_simple_partial_in$type <- "All var. controlled"
allsamples_ranger_simple_partial_tiponly_in$type <- "Tip dist. controlled"
allsamples_ranger_simple_partial_permuted_in$type <- "All var. controlled (permuted)"
allsamples_ranger_simple_spearman_in$type <- "No control"

allsamples_ranger_simple_combined <- do.call(rbind, list(allsamples_ranger_simple_partial_in, allsamples_ranger_simple_partial_tiponly_in, 
                                                  allsamples_ranger_simple_partial_permuted_in, allsamples_ranger_simple_spearman_in))
allsamples_ranger_simple_combined$Approach <- "Simple"

allsamples_ranger_hyperG_partial_in <- read.table(paste(allsamples_ranger_partial_folder, "hyperG.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_ranger_hyperG_partial_tiponly_in <- read.table(paste(allsamples_ranger_partial_folder, "hyperG.tiponly.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_ranger_hyperG_partial_permuted_in <- read.table(paste(allsamples_ranger_partial_folder, "hyperG.permuted.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)

allsamples_ranger_hyperG_spearman_in <- read.table(paste(allsamples_ranger_partial_folder, "hyperG.pairwise.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_ranger_hyperG_spearman_in <- allsamples_ranger_hyperG_spearman_in[grep("cooccur", allsamples_ranger_hyperG_spearman_in$X), ]
allsamples_ranger_hyperG_spearman_in <- allsamples_ranger_hyperG_spearman_in[which(allsamples_ranger_hyperG_spearman_in$Y == "ranger_hgt_tallies"), ]
allsamples_ranger_hyperG_spearman_in$p.val <- allsamples_ranger_hyperG_spearman_in$p.unc
allsamples_ranger_hyperG_spearman_in <- allsamples_ranger_hyperG_spearman_in[, colnames(allsamples_ranger_hyperG_partial_tiponly_in)]

allsamples_ranger_hyperG_partial_in$type <- "All var. controlled"
allsamples_ranger_hyperG_partial_tiponly_in$type <- "Tip dist. controlled"
allsamples_ranger_hyperG_partial_permuted_in$type <- "All var. controlled (permuted)"
allsamples_ranger_hyperG_spearman_in$type <- "No control"

allsamples_ranger_hyperG_combined <- do.call(rbind, list(allsamples_ranger_hyperG_partial_in, allsamples_ranger_hyperG_partial_tiponly_in, 
                                                  allsamples_ranger_hyperG_partial_permuted_in, allsamples_ranger_hyperG_spearman_in))
allsamples_ranger_hyperG_combined$Approach <- "HyperG"

allsamples_ranger_propr_partial_in <- read.table(paste(allsamples_ranger_partial_folder, "propr.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_ranger_propr_partial_tiponly_in <- read.table(paste(allsamples_ranger_partial_folder, "propr.tiponly.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_ranger_propr_partial_permuted_in <- read.table(paste(allsamples_ranger_partial_folder, "propr.permuted.partial.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)

allsamples_ranger_propr_spearman_in <- read.table(paste(allsamples_ranger_partial_folder, "propr.pairwise.tsv", sep = "/"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
allsamples_ranger_propr_spearman_in <- allsamples_ranger_propr_spearman_in[grep("cooccur", allsamples_ranger_propr_spearman_in$X), ]
allsamples_ranger_propr_spearman_in <- allsamples_ranger_propr_spearman_in[which(allsamples_ranger_propr_spearman_in$Y == "ranger_hgt_tallies"), ]
allsamples_ranger_propr_spearman_in$p.val <- allsamples_ranger_propr_spearman_in$p.unc
allsamples_ranger_propr_spearman_in <- allsamples_ranger_propr_spearman_in[, colnames(allsamples_ranger_propr_partial_tiponly_in)]

allsamples_ranger_propr_partial_in$type <- "All var. controlled"
allsamples_ranger_propr_partial_tiponly_in$type <- "Tip dist. controlled"
allsamples_ranger_propr_partial_permuted_in$type <- "All var. controlled (permuted)"
allsamples_ranger_propr_spearman_in$type <- "No control"

allsamples_ranger_propr_combined <- do.call(rbind, list(allsamples_ranger_propr_partial_in, allsamples_ranger_propr_partial_tiponly_in, 
                                                  allsamples_ranger_propr_partial_permuted_in, allsamples_ranger_propr_spearman_in))
allsamples_ranger_propr_combined$Approach <- "propr"

allsamples_ranger_partial_combined <- do.call(rbind, list(allsamples_ranger_simple_combined, allsamples_ranger_hyperG_combined, allsamples_ranger_propr_combined))
allsamples_ranger_partial_combined$Approach <- factor(allsamples_ranger_partial_combined$Approach, levels = c("Simple", "HyperG", "propr"))
allsamples_ranger_partial_combined$type <- factor(allsamples_ranger_partial_combined$type, levels = c("No control", "Tip dist. controlled", "All var. controlled", "All var. controlled (permuted)"))

BLAST (genus and above)

allsamples_partial_combined <- allsamples_partial_combined[which(! is.na(allsamples_partial_combined$r)), ]

print(ggplot(data = allsamples_partial_combined, aes(x = Approach, y = r, fill = type)) +
                              geom_quasirandom(dodge.width=.75, alpha = 0.5, col="grey80") +
                              geom_boxplot(outlier.shape = NA, alpha=0.7) +
                              theme_bw() +
                              theme(panel.grid.major.x = element_blank(),
                                    panel.grid.minor.x = element_blank(),
                                    plot.title = element_text(hjust = 0.5)) +
                              ylab("Partial Spearman's correlation coefficient") +
                              xlab("Co-occurrence approach") +
                              ggtitle("All ocean samples (BLAST - genus and above)") +
                              labs(fill="Spearman approach") +
                              scale_fill_manual(values = c("#c7733b", "#8d75ca", "#78a450", "#c16786"))
)

RANGER (strains)

allsamples_ranger_partial_combined <- allsamples_ranger_partial_combined[which(! is.na(allsamples_ranger_partial_combined$r)), ]

print(ggplot(data = allsamples_ranger_partial_combined, aes(x = Approach, y = r, fill = type)) +
                              geom_quasirandom(dodge.width=.75, alpha = 0.5, col="grey80") +
                              geom_boxplot(outlier.shape = NA, alpha=0.7) +
                              theme_bw() +
                              theme(panel.grid.major.x = element_blank(),
                                    panel.grid.minor.x = element_blank(),
                                    plot.title = element_text(hjust = 0.5)) +
                              ylab("Partial Spearman's correlation coefficient") +
                              xlab("Co-occurrence approach") +
                              ggtitle("All ocean samples (RANGER - between strains)") +
                              labs(fill="Spearman approach") +
                              scale_fill_manual(values = c("#c7733b", "#8d75ca", "#78a450", "#c16786"))
)

Partial Spearman correlations (HyperG only)

BLAST (genus and above)

allsamples_partial_hyperg <- allsamples_partial_combined[which(allsamples_partial_combined$Approach == 'HyperG'), ]

allsamples_partial_hyperg$type <- as.character(allsamples_partial_hyperg$type)
allsamples_partial_hyperg$type[which(allsamples_partial_hyperg$type == 'Tip dist. controlled')] <- 'Tip dist.\ncontrolled'
allsamples_partial_hyperg$type[which(allsamples_partial_hyperg$type == 'All var. controlled')] <- 'All var.\ncontrolled'
allsamples_partial_hyperg$type[which(allsamples_partial_hyperg$type == 'All var. controlled (permuted)')] <- 'All var.\ncontrolled\n(permuted)'
allsamples_partial_hyperg$type <- factor(allsamples_partial_hyperg$type,
                                         levels = c("No control", "Tip dist.\ncontrolled", "All var.\ncontrolled", "All var.\ncontrolled\n(permuted)"))

print(ggplot(data = allsamples_partial_hyperg, aes(x = type, y = r, fill = type)) +
                              geom_quasirandom(dodge.width=.75, alpha = 0.5, col="grey80") +
                              geom_boxplot(outlier.shape = NA, alpha=0.7) +
                              theme_bw() +
                              theme(panel.grid.major.x = element_blank(),
                                    panel.grid.minor.x = element_blank(),
                                    plot.title = element_text(hjust = 0.5),
                                    legend.position = "none") +
                              ylab("Correlation coefficient") +
                              xlab("Spearman approach") +
                              ggtitle("All ocean samples (BLAST - genus and above)") +
                              scale_fill_manual(values = c("#c7733b", "#8d75ca", "#78a450", "#c16786"))
)

RANGER (strains)

allsamples_ranger_partial_hyperg <- allsamples_ranger_partial_combined[which(allsamples_ranger_partial_combined$Approach == 'HyperG'), ]

allsamples_ranger_partial_hyperg$type <- as.character(allsamples_ranger_partial_hyperg$type)
allsamples_ranger_partial_hyperg$type[which(allsamples_ranger_partial_hyperg$type == 'Tip dist. controlled')] <- 'Tip dist.\ncontrolled'
allsamples_ranger_partial_hyperg$type[which(allsamples_ranger_partial_hyperg$type == 'All var. controlled')] <- 'All var.\ncontrolled'
allsamples_ranger_partial_hyperg$type[which(allsamples_ranger_partial_hyperg$type == 'All var. controlled (permuted)')] <- 'All var.\ncontrolled\n(permuted)'
allsamples_ranger_partial_hyperg$type <- factor(allsamples_ranger_partial_hyperg$type,
                                         levels = c("No control", "Tip dist.\ncontrolled", "All var.\ncontrolled", "All var.\ncontrolled\n(permuted)"))


print(ggplot(data = allsamples_ranger_partial_hyperg, aes(x = type, y = r, fill = type)) +
                              geom_quasirandom(dodge.width=.75, alpha = 0.5, col="grey80") +
                              geom_boxplot(outlier.shape = NA, alpha=0.7) +
                              theme_bw() +
                              theme(panel.grid.major.x = element_blank(),
                                    panel.grid.minor.x = element_blank(),
                                    plot.title = element_text(hjust = 0.5),
                                    legend.position = "none") +
                              ylab("Correlation coefficient") +
                              xlab("Spearman approach") +
                              ggtitle("All ocean samples (RANGER - between strains)") +
                              scale_fill_manual(values = c("#c7733b", "#8d75ca", "#78a450", "#c16786"))
)

Session info

Hide

Show

sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_CA.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_CA.UTF-8    
##  [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_CA.UTF-8   
##  [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/Toronto
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] reshape2_1.4.4   knitr_1.47       kableExtra_1.4.0 ggbeeswarm_0.7.2
## [5] ggplot2_3.5.1    cowplot_1.1.3    circlize_0.4.16 
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.5          beeswarm_0.4.0        shape_1.4.6.1        
##  [4] rjson_0.2.21          xfun_0.44             bslib_0.7.0          
##  [7] GlobalOptions_0.1.2   vctrs_0.6.5           tools_4.4.0          
## [10] generics_0.1.3        stats4_4.4.0          parallel_4.4.0       
## [13] tibble_3.2.1          fansi_1.0.6           cluster_2.1.2        
## [16] highr_0.11            pkgconfig_2.0.3       RColorBrewer_1.1-3   
## [19] S4Vectors_0.42.0      lifecycle_1.0.4       compiler_4.4.0       
## [22] farver_2.1.2          stringr_1.5.1         munsell_0.5.1        
## [25] codetools_0.2-18      ComplexHeatmap_2.20.0 clue_0.3-65          
## [28] vipor_0.4.7           htmltools_0.5.8.1     sass_0.4.9           
## [31] yaml_2.3.8            pillar_1.9.0          crayon_1.5.2         
## [34] jquerylib_0.1.4       cachem_1.1.0          iterators_1.0.14     
## [37] foreach_1.5.2         tidyselect_1.2.1      digest_0.6.35        
## [40] stringi_1.8.4         dplyr_1.1.4           labeling_0.4.3       
## [43] fastmap_1.2.0         colorspace_2.1-0      cli_3.6.2            
## [46] magrittr_2.0.3        utf8_1.2.4            withr_3.0.0          
## [49] scales_1.3.0          rmarkdown_2.27        matrixStats_1.3.0    
## [52] png_0.1-8             GetoptLong_1.0.5      evaluate_0.23        
## [55] IRanges_2.38.0        doParallel_1.0.17     viridisLite_0.4.2    
## [58] rlang_1.1.3           Rcpp_1.0.12           glue_1.7.0           
## [61] xml2_1.3.6            BiocGenerics_0.50.0   svglite_2.1.3        
## [64] rstudioapi_0.16.0     jsonlite_1.8.8        R6_2.5.1             
## [67] plyr_1.8.9            systemfonts_1.1.0